*** CA Tracks merging and cleaning

local trackdatafile "`1'"

local price_file "`2'"
local cpi_file "`3'"
local unem_file "`4'"

*** loading and preparing unemployment rate data *******************************
import delimited using "`unem_file'" , clear
gen date2 = date(date,"YMD")
drop date
rename date2 date
format  date %td
gen m = month(date)
gen y = year(date)
drop date
save unem, replace

*** loading and preparing fuel prices data *************************************
* get monthly CPI data 
import delimited using "`cpi_file'" , clear
gen date2 = date(date,"MDY")
drop date
rename date2 date
format  date %td
gen m = month(date)
gen y = year(date)
save cpi, replace

* load price data
import delimited using "`price_file'" , clear
drop index* volume bid ask open open_i	mean currency units description
gen date_num = date(date, "MDY")
drop date
rename date_num date
format date %td

* merging in CPI
gen m = month(date)
gen y = year(date)
merge m:m y m using "cpi" , gen(_mcpi)
drop if _mcpi~=3
drop _mcpi

* get base year cpi
sum cpiaucsl if y==2011 & m==1
scalar cpi_base = r(mean)

* deflator
gen cpi_def = cpiaucsl / cpi_base

* rename 
gen low_nom = low
gen high_nom = high
gen close_nom = close

* deflate
replace low = low/cpi_def
replace high = high/cpi_def
replace close = close/cpi_def

gen type = "IFO380"
replace type = "MGO" if symbol=="AAWYR00"
replace type = "MDO" if symbol=="POACP00"

drop symbol

reshape wide low* high* close* , j(type) i(date) string

egen IFO = rmean(lowIFO380  highIFO380) // mean of IFO high and low
egen MDO = rmean(lowMDO  highMDO) // mean of MDO high and low
egen MGO = rmean(lowMGO  highMGO) // mean of MGO high and low

* MDO and MGO overlap for about 4 months (sept 2012 to dec 2012) prices are identical
gen LS = MDO
replace LS = MGO if MDO==.

* this substracts dow from date ind
* therefor it gives an indicator for day 0 of the week of each observation
gen week_ind = date - dow(date)
format week_ind %td

collapse IFO MDO MGO LS , by(week_ind)

* collapsing to weekly averages
*** only have work week observations
*** not clear when boats actually fuel

gen LSprem  = LS - IFO
gen LSprem_perc = LSprem/IFO
label var LSprem "LS Premium"


sum LSprem_perc if week_ind>=mdy(1,1,2008) & week_ind<=mdy(1,1,2016)
sum LSprem_perc if week_ind>=mdy(7,1,2009) & week_ind<=mdy(8,1,2012)


*twoway (line LSprem week_ind)    if week_ind>=mdy(1,1,2008)
save "fuelPrices", replace

********************************************************************************
* loading track data
import delimited using "`trackdatafile'" , clear asfloat
*use "`trackdatafile'" , replace 
*drop m y
*destring aisyear dist , replace

* adding track id
drop id_track track_id
egen track_id = group(oid aisyear)

* recoding missing values
mvdecode * , mv(-9999)

* cleaning length and width (0 is missing)
mvdecode length width , mv(0)

* encoding group
encode group, gen(group_i) 
label list group_i

encode group_agg, gen(group_agg_i) 
label list group_agg_i

* cleaning imo number
* destring if it is a string
capture confirm string var imo
if _rc==0 {
	replace imo="" if imo=="-9999"
	replace imo = subinstr(imo , "IMO" , "" , . ) 
	destring imo , replace
}

* cleaning up any imo numbers with more than 7 digits (with 0 at end)
* clearing last digit(s) if zero
gen imo_div = imo/1e7
replace imo = . if imo_div < 0.1 // removes any imo numbers that don't have 7 digits

replace imo = imo/10 if imo_div>1 & imo_div<10 & mod(imo,10)==0
replace imo = imo/100 if imo_div>10 & imo_div<100 & mod(imo,100)==0

replace imo_div = imo/1e7
replace imo = . if imo_div > 1 // removes any imo numbers that have more than 7

drop imo_div

* renaming
*rename shape_length dist
rename time_eca2009 hours_eca2009
rename time_eca2011 hours_eca2011

*rename portcode_sn1 portSN1 
*rename portcode_sn2 portSN2 

* dealing with dates/times
split time1 , p(" ")
split time2 , p(" ")

gen date1 = date(time11, "YMD")
gen date2 = date(time21, "YMD")
format date1 %td
format date2 %td

gen clock1 = clock(substr(time12,1,8),"hms")
gen clock2 = clock(substr(time22,1,8),"hms")
format clock1 %tcHH:MM:SS
format clock2 %tcHH:MM:SS

generate dt1 = dhms(date1,hh(clock1),mm(clock1),ss(clock1))
generate dt2 = dhms(date2,hh(clock2),mm(clock2),ss(clock2))
format dt1 %tcNN/DD/CCYY_HH:MM:SS
format dt2 %tcNN/DD/CCYY_HH:MM:SS

drop time1 time2 time12 time21 time11 time22

* generating year, month, and month*year indicators
* rule for date is 
* 1) use earliest date
* 2) unless earlier date is 
gen WC1 = port1<=12 //
gen WC2 = port2<=12 

gen date = date1
replace date = date2 if (WC1==0 & WC2==1)

gen dt = dt1
replace dt = dt2 if (WC1==0 & WC2==1) 
format dt %tcNN/DD/CCYY_HH:MM:SS

gen m = month(date)
gen y = year(date)
gen dow1 = dow(date)
gen q = quarter(date)
gen my = ym(y,m)
format my %tm 
gen qy = yq(y,q) // year by quarter
format qy %tq
gen t = date - mdy(1,1,2009)
gen t2 = t*t

* generate week identifier to merge in fuel prices
gen week_ind = date - dow(date)
format week_ind %td

gen smy = string(my,"%tm")
labmask my , val(smy)
drop smy

gen mmsi_scrambled = (aisyear>=2010) & (aisyear<=2014)

/*
* generating vessel IDs that reflect the mmsi scrambling in 2010-2014
* also allowing for same mmsi but different IMO in years when we have that
* generate a unique vessel identifier
gen mmsi_scrambled = (aisyear>=2010) & (aisyear<=2014)
mvencode imo, mv(-999)
egen ves_id = group(mmsi mmsi_scrambled imo)
mvdecode imo, mv(-999=.)
*/

* generating unique vessel ids
* using IMO if available
* otherwise use mmsi, but accounting for scrambling
egen uniq_mmsi = group(mmsi mmsi_scrambled)
gen ves_id_tmp = imo
replace ves_id_tmp = uniq_mmsi/1000000 if ves_id_tmp==.
egen ves_id = group(ves_id_tmp)

rename vesseltype vesselType_detailed
*encode vessel_str, gen(vesseltype) 
*label list vesseltype
*drop vessel_str

* generating a hazardous category variable
* only really useful for 2009-until updated group numbers are used (4 digit ones)
gen hazard_cat = 0 
replace hazard_cat = 1  if inlist(vesselType_detailed, 21, 41, 61, 71, 81, 91)
replace hazard_cat = 2  if inlist(vesselType_detailed, 22, 42, 62, 72, 82, 92)
replace hazard_cat = 3  if inlist(vesselType_detailed, 23, 43, 63, 73, 83, 93)
replace hazard_cat = 4  if inlist(vesselType_detailed, 24, 44, 64, 74, 84, 94)
label var hazard_cat "Hazardous Category"
label define hazard_cat 0 "None" 1 "Major Hazard" 2 "Hazard" 3 "Minor Hazard" 4 "Recog Hazard"
label values hazard_cat hazard_cat

* https://help.marinetraffic.com/hc/en-us/articles/205579997-What-is-the-significance-of-the-AIS-Shiptype-number-
*category A = IMO category X - major hazard (no discharge)
*category B = IMO category Y - hazard (limit discharge)
*category C = IMO category Z - minor hazard (less stringent limits)
*category D = IMO category OS - recognisable hazard

gen hazard_ind = (hazard_cat>0) 
label var hazard_ind "Hazardous"


* flag for vessels coming from historic wfr
*rename from_hist hist_wfr_only  

/*
tab hist_wfr_only y
distinct mmsi if hist_wfr_only==1
tab hist_wfr_only y if merged_chars==1
*/

/*
***** WRITE SUMMARY STATS ******************************************************
egen ves_y_tag = tag(ves_id y) // sums to total vessel by year
egen ves_tag = tag(ves_id)    // sums to total vessel
gen ves_y_tag_noimo = ves_y_tag*(~has_imo) // sums to total vessels by year without imo
gen ves_y_tag_wfr = ves_y_tag*(merged_chars) // sums to total vessels by year with WFR
gen ves_y_tag_imo_nowfr = ves_y_tag*(~merged_chars)*has_imo // sums to total vessels by year with imo but not in wfr
gen merged_chars100 = merged_chars*100

/*
preserve
keep if has_imo & ~merged_chars
keep imo
duplicates drop imo, force 
export delimited using "C:\Users\rklotz\OneDrive\Work\Ports_ECAs\marine_traffic_scrape\imos_to_scrape.csv"
restore
*/

***** summary stats for data creation 
* Tracks, vessels, etc over time

tabout y using merge_summary_stats.tex if vesseltype=="Cargo":vesseltype , oneway replace sum ///
	cells(N dist mean merged_chars100 sum ves_y_tag sum ves_y_tag_wfr sum ves_y_tag_noimo sum ves_y_tag_imo_nowfr) ///
	format( 0c p1 0c 0c 0c 0c  ) ptotal(none) h2( \textbf{Cargoships}\\  & Tracks & Tracks 	& Vessels & Vessels   & Vessels 	& Vessels \\ ) /// 
											  h3( &        & w. chars.  &  		  & w. chars. &  no IMO 	& no WFR  \\ ) ///
	style(tex) bt topf($top_tex) 

tabout y using merge_summary_stats.tex if vesseltype=="Tanker":vesseltype , oneway append sum ///
	cells(N dist mean merged_chars100 sum ves_y_tag sum ves_y_tag_wfr sum ves_y_tag_noimo sum ves_y_tag_imo_nowfr) ///
	format( 0c p1 0c 0c 0c 0c  ) ptotal(none) h2(\\ \textbf{Tankers} \\ )  h3(nil) ///
	style(tex) bt botf($bot_tex) 
	
drop ves_y_tag ves_tag ves_y_tag_noimo ves_y_tag_wfr ves_y_tag_imo_nowfr merged_chars100
********************************************************************************
*/

* clean up length and width for vessels without WFR data
rename length length_ais
rename width width_ais


* use wfr data for length and width if have data from WFR
gen length = loa if merged_chars==1
gen width = beam if merged_chars==1


/*
* throw error if any WFR vessels missing length and width
* originally there were none
sum length if length==. & merged_chars==1
if r(N)>0 {
	error
}
*/

* if only in AIS then take modal length and width
replace length_ais=. if length_ais>400 | length_ais<70
replace width_ais=. if width_ais>60 | width_ais<10
*egen sd_length = sd(length_ais) if merged_chars==0, by(ves_id)
egen mode_length = mode(length_ais) if merged_chars==0 , by(ves_id) maxmode
egen mode_width = mode(width_ais) if merged_chars==0 , by(ves_id) maxmode
replace length = mode_length if merged_chars==0 
replace width = mode_width if merged_chars==0

drop mode_width mode_length 


* clean AIS type (might not be used) *******************************************
* count number of vessel types (none other or NA) for each vessel id
/*
gen type_noNA = vesseltype if ~(vesseltype=="NA":vesseltype)
egen tag = tag(ves_id type_noNA) // tag each ves_id and type
egen types = total(tag), by(ves_id) // count number of tags

* if types>0 - 1 or more non-NA type
* use modal type
egen modal_type = mode(vesseltype) , minmode by(ves_id)
gen type2 = modal_type if types>0
replace type2 = "NA":vesseltype if types==0
label value type2 vesseltype

rename vesseltype vesseltype_ais
rename type2 vesseltype

drop types modal_type tag type_noNA
*/

* vessel variables
rename powertype power_str
encode power_str, gen(powertype) 
label list powertype

rename mainenginefueltype fuel_str
encode fuel_str, gen(fueltype) 
label list fueltype

* generating vessel power in horsepower
gen power_hp = ( power_kw*1.34102 ) / 1000
label var power_hp "Power (1000 HP)"

* getting marine identification digits and US flagged ships ********************
*gen USflag = inlist(mid, 338, 366, 367, 368, 369)
gen USflag = inlist(flag_country_mid , "United States of America" )
label var USflag "US Flag"


* calculating duration and avg speed
gen hours = (dt2-dt1)/(60000*60)
/*
gen avg_speed = dist/hours
*gen speed_kmh_imp = dist/hours

* calculating distance/time/speed out of ECA
** removing negative distances, b/c these are all very small (1/10 of km)
** also removing negative times, b/c these were very small
** these tracks are effectively always out of ECA, just geoprocessing or rounding issue

gen dist_out2009 = max( 0 , dist - dist_eca2009 ) 
gen dist_out2011 = max( 0 , dist - dist_eca2011 ) 

gen hours_noeca2009 = max( 0, hours - hours_eca2009 ) 
gen hours_noeca2011 = max( 0, hours - hours_eca2011 ) 

gen noeca2009_ind = (dist_out2009<2) | (hours_noeca2009<1) 
gen noeca2011_ind = (dist_out2011<2) | (hours_noeca2011<1) 

*replace dist_eca2009=. if date1>= mdy(2,1,2012)
*replace dist_eca2011=. if date1<= mdy(8,1,2011)

*replace dist_out2009=. if date1>= mdy(2,1,2012)
*replace dist_out2011=. if date1<= mdy(8,1,2011)

* THESE ARE NOT CORRECT - SEEMS TO BE MISCALCULATED IN ARCPY SCRIPT
gen avg_speed_eca2009 = dist_eca2009/hours_eca2009
gen avg_speed_eca2011 = dist_eca2011/hours_eca2011 

* only generating speed for distances greater than a kilometer
gen avg_speed_noeca2009 = dist_out2009/hours_noeca2009 if noeca2009_ind~=1
gen avg_speed_noeca2011 = dist_out2011/hours_noeca2011 if noeca2011_ind~=1

* distance in 1000 km
gen dist1000 = dist/1000 	
gen dist_eca2009_1000 = dist_eca2009/1000
gen dist_eca2011_1000 = dist_eca2011/1000
gen dist_out2009_1000 = dist_out2009/1000
gen dist_out2011_1000 = dist_out2011/1000

* getting fuel consumption per km (t/km)
gen tkm_cons = f_cons / dist 
gen tkm_power = f_power / dist 
gen tkm_eca09_cons = f_eca09_cons / dist_eca2009 
gen tkm_eca09_power = f_eca09_power / dist_eca2009
gen tkm_eca11_cons = f_eca11_cons / dist_eca2011 
gen tkm_eca11_power = f_eca11_power / dist_eca2011

gen tkm_out09_cons = f_out09_cons / dist_out2009 
gen tkm_out09_power = f_out09_power / dist_out2009
gen tkm_out11_cons = f_out11_cons / dist_out2011 
gen tkm_out11_power = f_out11_power / dist_out2011


* labeling
label variable dist "Distance (km)"
label variable dist_eca2009 "2009 ECA"
label variable dist_eca2011 "2011 ECA"
label variable dist_eca2009_1000 "2009 ECA"
label variable dist_eca2011_1000 "2011 ECA"
label variable dist_out2009_1000 "2009 ECA"
label variable dist_out2011_1000 "2011 ECA"

label variable avg_speed "Speed (km/h)"
label variable avg_speed_eca2009 "In 2009 ECA"
label variable avg_speed_eca2011 "In 2011 ECA"
label variable avg_speed_noeca2009 "Out 2009 ECA"
label variable avg_speed_noeca2011 "Out 2011 ECA"
*/

label variable length "Length (m)"
label variable width "Width (m)"
 
label var length_ais "Length (m)"
label var width_ais "Width (m)"


* merging in fuel prices *******************************************************
merge m:1 week_ind using "fuelPrices" , gen(_mprices)
drop if _mprices==2
drop _mprices

* merging in unemployment rate**************************************************
merge m:1 m y using "unem" , gen(_munem)
drop if _munem==2
drop _munem

/*
* adding value labels to port
label define portSN 1	"Hilo" ///
2	"Kahalui" ///
3	"Nawil" ///
4	"Honolulu" ///
5	"PHarbor" ///
6	"BPoint" ///
7	"Ensen" ///
8	"San Diego" ///
9	"LA/LB" ///
11	"Hueneme" ///
12	"San Francisco" ///
13	"HB" ///
14	"CB" ///
15	"Portland" ///
16	"GH" ///
17	"Seattle" ///
18	"Alb" ///
19	"PWSound" ///
20	"CookIn" ///
21	"Unimak Pass"
						 
label values portSN1 portSN
label values portSN2 portSN
*/

encode route_name, gen(route_id) 
label list route_id

encode route_name_interp, gen(route_id_interp) 
label list route_id_interp


********************************************************************************
/*
* now creating a route variable that has exiting/entering as its own category
* using portSN because it is indexed south to north
replace minportSN = 100 if minportSN>100 & minportSN~=.
replace maxportSN = 100 if maxportSN>100 & maxportSN~=.
replace minportname = "USbound" if minportSN==100
replace maxportname = "USbound" if maxportSN==100

* generate text, then make into categorical
gen route_name = minportname + "_" + maxportname if classified==1
label variable route_name "Routes"
encode route_name, gen(route_id) 
label list route_id
drop minportSN maxportSN minportname maxportname
*/

* routes - for broader aggregation of ports
egen port_agg1_id = group(port_agg1) , label
egen port_agg2_id = group(port_agg2) , label
rowsort port_agg1_id port_agg2_id, gen(svar1 svar2)
egen route_agg = group(svar*) , label
drop svar*


* southern exits
*gen south = port_agg1=="USbound_S" | port_agg2=="USbound_S"
gen south = port1==101 | port2==101
gen south_broad = inlist(port1,101,102) | inlist(port2,101,102)
gen west = inlist(port1,103,104,105,106) | inlist(port2,103,104,105,106)
gen north = inlist(port1,107,108,109) | inlist(port2,107,108,109)

* combine LB and LA
** I did this by modifying the excel sheet with the port labels in in
** Can recover with location if desired using port1 and port2

*** Classifying tracks *************
/*
* indicators for port fixed effects (unused??)
gen port1_fe = port1 if port1~=.
replace port1_fe = 0 if port1_fe>100 & port1_fe~=.
gen port2_fe = port2 if port2~=.
replace port2_fe = 0 if port2_fe>100 & port2_fe~=.
*/

* flag for route to AK or HI
gen HIport = inlist(port_agg1,"HI") | inlist(port_agg2,"HI") 
gen AKport = inlist(port_agg1,"AK","Unimak") | inlist(port_agg2,"AK","Unimak") 
gen otherAK = inlist(port1,15,16) | inlist(port2,15,16)
gen HItoHI = inlist(port_agg1,"HI") & inlist(port_agg2,"HI") 
gen AKtoAK = inlist(port_agg1,"AK","Unimak") & inlist(port_agg2,"AK","Unimak") 

* dummy for whether route moves between two ports
gen port2port = (port1<100 & port2<100) 

* indicator for offshore route
gen offshore = inlist(route_name,"offECAS_offECAN","EguadS_EguadN","WguadS_WguadN","BajaS_BajaN","BajaN_US1","WguadN_offECAS")

gen WCport = (port1<=12 | port2<=12) // all west coast ports (minus ensenada)


* determine if track is to/from CA port
local CAports "1,2,3,4,5,12"
gen CAport = inlist(port1,`CAports') | inlist(port2,`CAports')
gen port2port_CA = inlist(port1,`CAports') & inlist(port2,`CAports') 

* determine if track is to/from a major port
local MAJORports "1,3,4,6,7,14,17"
gen MAJORport = inlist(port1,`MAJORports') | inlist(port2,`MAJORports')
gen port2port_MAJOR = inlist(port1,`MAJORports') & inlist(port2,`MAJORports') // if track ONLY moves between major ports

* Routes to LA/LB
local LALBports "3,4"
gen LALB = inlist(port1,`LALBports') | inlist(port2,`LALBports')

* Other Ports
gen SFBay = inlist(port1,1) | inlist(port2,1)
gen SD = inlist(port1,2) | inlist(port2,2)
gen HUE = inlist(port1,5) | inlist(port2,5)
gen SEA_POR = inlist(port1,6,7) | inlist(port2,6,7)
gen POR = inlist(port1,6) | inlist(port2,6)
gen SEA = inlist(port1,7) | inlist(port2,7)

* classifying all tracks to LA/LB, other CA, other WC
local CAports "1,2,3,4,5,12"
local SOCALport "2,3,4,5"
gen port_agg = "SoCal" if inlist(port1,`SOCALport') | inlist(port2,`SOCALport') // any track to southern california
replace port_agg = "NoCal" if port_agg=="" & ( inlist(port1,`CAports') | inlist(port2,`CAports') ) // any track to california, but not SoCal
replace port_agg = "OtherWC" if port_agg=="" & ( port1<12  | port2<12 )

*capture drop port_agg_i
gen port_agg_i = 1 if port_agg=="SoCal"
replace port_agg_i = 2 if port_agg=="NoCal"
replace port_agg_i = 3 if port_agg=="OtherWC"

capture label drop port_agg_i
label define port_agg_i 1 "So. Cal" ///
2	"No. Cal" ///
3	"Other WC" 
label values port_agg_i port_agg_i


* update route type
* coastal
* enter-exits
* interpolated (long)
	* flag interpolated that were part of enter-exits
	* might have some that are not included in enter-exits

* route type 
* "coastal" is west coast to west coast port
* "EnterExit" is study area boundary to west coast
* "LongInterp" is connection from west coast to AK or HI - which are subset of EnterExit
gen route_type = "Coastal" if port2port==1 & ~(HIport==1 | AKport==1)
replace route_type = "EnterExit" if WCport==1 & (port1>100 | port2>100)
replace route_type = "LongInterp" if ( port2port==1 & (HIport==1 | AKport==1) ) & AKtoAK==0 & HItoHI==0

gen route_type_i = 1 if route_type=="Coastal"
replace route_type_i = 2 if route_type=="EnterExit"
replace route_type_i = 3 if route_type=="LongInterp"
capture label drop route_type_i
label define route_type_i 1 "Port-to-Port" ///
2	"Ent/Exit" ///
3	"Long Interp" 
label values route_type_i route_type_i

gen enterexit = (route_type=="EnterExit")
gen porttoport = (route_type=="Coastal")

/*
* route type
* 1 is west coast to west coast (call these short)
* 2 is connection from west coast to AK or HI (call these long)
* 3 is offshore
gen route_type = 1 if port2port==1 & ~(HIport==1 | AKport==1)
replace route_type = 2 if port2port==1 & (HIport==1 | AKport==1)
replace route_type = 3 if offshore==1
tab route_name route_type , miss
*/

* determine if track is leaving 100nm boundary
*gen exiting = (port1>100 & port1<=110) | (port2>100 & port2<=110) if WCport==1
*label define lab_exiting 0 "Port to Port" 1 "Port to Boundary"
*label values exiting lab_exiting

* port if exiting
*gen port_ifexit = port1_name if exiting==1
*replace port_ifexit = port2_name if port2<100  & exiting==1
*egen port_ifexit_i = group(port_ifexit)

* determing if track through SB channel and other waypoints
gen LBind = (port1==4) | (port2==4) // creating indicator for LB as opposed to LA
gen SBind = (strpos(port_id, "51")>0)
gen SF_TSS_S = (strpos(port_id, "52")>0)
gen SF_TSS_W = (strpos(port_id, "53")>0)
gen SF_TSS_N = (strpos(port_id, "54")>0)
gen LA_TSS_S = (strpos(port_id, "55")>0)
gen LA_TSS_W = (strpos(port_id, "56")>0)

* crude terminals and other intermediate ports
replace port_id = subinstr(port_id,"130","9999",.)
* updating this - includes both methods for flagging these stops
* elsegundo, rosarito are based on grid, port_id based on track creation
gen crude_term = (elsegundo==1) | (rosarito==1) | (strpos(port_id, "57")>0) | (strpos(port_id, "58")>0) // stops at elsegundo or rosarito
gen inter_port_ind = (crude_term==1) | (strpos(port_id, "13")>0) | (ensenada==1) 
replace port_id = subinstr(port_id,"9999","130",.)												  
label var SBind "SB Channel"

label define sf_tss_w 0 "Other" 1 "TSS W"
label values SF_TSS_W sf_tss_w

gen elsegundo_ind = (strpos(port_id, "57")>0)
gen rosarito_ind = (strpos(port_id, "58")>0)

* whether a 100nm boundary is in track's port list - can use to identify vessels moving much farther from coast.
gen leaves100nm = (strpos(port_id, "101")>0) | (strpos(port_id, "102")>0) | (strpos(port_id, "103")>0) | ///
				(strpos(port_id, "104")>0) | (strpos(port_id, "105")>0) | (strpos(port_id, "106")>0) | ///
				(strpos(port_id, "107")>0) | (strpos(port_id, "108")>0) | (strpos(port_id, "109")>0) 

* trips that start and end at same port 
gen samePort = (port1==port2)

* starting ending dates for port slowdown according to moore et al
local portslowStart = mdy(3,1,2014) // Trying to be cautious (starting congestion at LA/LB much sooner
*local portslowStart = mdy(10,1,2014) // contract expires
*local portslowEnd =   mdy(2,20,2015) // contract resolved
local portslowEnd =   mdy(5,20,2015) // allow 3 months extra to resolve backlog

gen portslow = (date1>=`portslowStart') & (date1<=`portslowEnd')

*** ship size bins 
egen length_bin = cut(length), at(50(50)400)

* which vessel types to include (2,8 are cargo and tankers)
*gen incVessel = inlist(vesseltype,2,8)


*label var cargo "Cargo"
*label var USflag "US Flag"
			   
* route by year indicators
egen ry = group(route_id y)



* Generating some other vessel characteristics

gen twostroke = (powertype=="Diesel 2-Stroke":powertype)
label variable twostroke "2-Stroke Diesel"
gen steam  = (powertype=="Steam Turbine":powertype)
label variable steam "Steam"

gen old = (built<=1999) if built~=.			   
gen FCC = (group=="FCC") if group~=""	
label variable FCC "Container"
label define FCC 0 "Other" 1 "FCC"
label values FCC FCC

* creating indicator for vessel type used in regressions
gen vesseltype_reg = 1 if group_agg_i=="FCC":group_agg_i 
replace vesseltype_reg = 2 if (group_agg_i=="Bulker":group_agg_i) | (group_agg_i=="Other Cargo":group_agg_i)
replace vesseltype_reg = 3 if (group_agg_i=="Tankers":group_agg_i) 
replace vesseltype_reg = 4 if (group_agg_i=="Passenger":group_agg_i) 
label define vesseltype_reg 1 "Container" 2 "Other Cargo" 3 "Tanker"  4 "Passenger" 
label values vesseltype_reg vesseltype_reg
decode vesseltype_reg , gen(vesseltype_regstr)

gen other_cargo = (vesseltype_reg=="Other Cargo":vesseltype_reg) & vesseltype_reg~=.
label variable other_cargo "Other Cargo"

gen tanker = (vesseltype_reg=="Tanker":vesseltype_reg) & vesseltype_reg~=.
label variable tanker "Tankers"

/*
gen PCC = (group=="PCC") if group~=""	
label variable PCC "Car Carrier"

gen Bulk = (group=="Bulk Carrier") if group~=""	
label variable Bulk "Bulker"
*/

* getting indicator for exempt vessels for CA ECA
* >=400ft (121 meters)
* >= 10,000 GT
* >= 30 liter per cylinder displacement
* not steam 
*gen exempt = ( (steam==1) | ~( length>=121 | gt>=10000 ) ) if steam~=. & length~=. & gt~=. 
*gen exempt_v2 = ( (steam==1) | ~( length>=121 | gt>=10000 | enginedispl_cul>=30 ) ) if steam~=. & length~=. & gt~=. 
gen exempt = (steam==1) | ( length<121 & gt<10000 & enginedispl_cul<30 )  if steam~=. & length~=. & gt~=. 
label variable exempt "Exempt (CA)"

gen treat = (exempt~=1)
label variable treat "Treated (CA)"


**** Policy Dates and Changes
* Policy dates *****************************************************************
scalar ecaCA1_scal = mdy(7,1,2009) /* CA ECA put in place */
scalar ecaCA2_scal = mdy(12,1,2011) /* CA ECA boundary change */
scalar ecaCA3_scal = mdy(1,1,2014) /* CA ECA stringency (.1%) */
scalar ecaNA1_scal = mdy(8,1,2012) /* NA ECA (1%), CA stringency change (1%) */
scalar ecaNA2_scal = mdy(1,1,2015) /* NA ECA stringency (0.1%)*/

* policy dummies
gen ecaCA1 = (date>=ecaCA1_scal)
gen ecaCA2 = (date>=ecaCA2_scal) 
gen ecaCA3 = (date>=ecaCA3_scal) 
gen ecaNA1 = (date>=ecaNA1_scal)
gen ecaNA2 = (date>=ecaNA2_scal)

* eca month indcators (for month plots)
scalar startmonth = ym( year(2009),month(1) )
scalar ecaCA1month = ym( year(ecaCA1_scal),month(ecaCA1_scal) )
scalar ecaCA2month = ym( year(ecaCA2_scal),month(ecaCA2_scal) )
scalar ecaCA3month = ym( year(ecaCA3_scal),month(ecaCA3_scal) )
scalar ecaNA1month = ym( year(ecaNA1_scal),month(ecaNA1_scal) )
scalar ecaNA2month = ym( year(ecaNA2_scal),month(ecaNA2_scal) )

* eca quarter indcators (for month plots)
scalar ecaCA1quarter = yq( year(ecaCA1_scal),quarter(ecaCA1_scal) )
scalar ecaCA2quarter = yq( year(ecaCA2_scal),quarter(ecaCA2_scal) )
scalar ecaCA3quarter = yq( year(ecaCA3_scal),quarter(ecaCA3_scal) )
scalar ecaNA1quarter = yq( year(ecaNA1_scal),quarter(ecaNA1_scal) )
scalar ecaNA2quarter = yq( year(ecaNA2_scal),quarter(ecaNA2_scal) )

* this is indicator for different periods of ECA
gen eca_ind = 0
replace eca_ind = 1 if ecaCA1==1 & ecaCA2==0 
replace eca_ind = 2 if ecaCA2==1 & ecaNA1==0 
replace eca_ind = 3 if ecaNA1==1 & ecaCA3==0
replace eca_ind = 4 if ecaCA3==1 & ecaNA2==0
replace eca_ind = 5 if ecaNA2==1 
 
disp ecaCA2month - ecaCA1month
disp ecaNA1month - ecaCA2month
disp ecaCA3month - ecaNA1month
disp ecaNA2month - ecaCA3month 

label define lab_eca_ind 0 "No ECA" 1 "CA (1.5%), 2009" 2 "CA (1.5%), 2011" ///
				   3 "CA(1%) NA(1%)" 4 "CA(0.1%) NA(1%)" 5 "CA(0.1%) NA(0.1%)" 
label define lab_eca_ind_shrt 0 "No ECA" 1 "CA 2009" 2 "CA 2011" ///
				   3 "CA+NA(1%)" 4 "CA(0.1%)+NA" 5 "CA+NA(0.1%)" 
label define lab_eca_ind_ca 0 "No ECA" 1 "CA 2009" 2 "CA 2011" ///
				   3 "CA 1%" 4 "CA 0.1%" 5 "CA 0.1%" 
label define lab_eca_ind_na 0 "" 1 "" 2 "" ///
				   3 "NA 1%" 4 "NA 1%" 5 "NA 0.1%" 
				   
* labels are CA(1%) indicating level of MGO, did this to save room and because MDO limit only changes when MGO one does
label values eca_ind lab_eca_ind

* this is indicator for different periods of ECA (collapsing 2011 ECA boundary change and 0.1% reduction in CA)
gen eca_ind4 = 0
replace eca_ind4 = 1 if ecaCA1==1 & ecaCA2==0 
replace eca_ind4 = 2 if ecaCA2==1 & ecaNA1==0
replace eca_ind4 = 3 if ecaNA1==1 & ecaNA2==0
replace eca_ind4 = 4 if ecaNA2==1 

label define lab_eca_ind4 0 "No ECA" 1 "CA (1.5%), 2009" 2 "CA (1.5%), 2011" ///
				   3 "CA NA(1%)" 4 "CA(0.1%) NA(0.1%)" 
label values eca_ind4 lab_eca_ind4
				   
* making set of dummies consistent with eca_ind
forvalues v = 1(1)5 {
	gen eca`v' = (eca_ind==`v')
	local lab : label lab_eca_ind `v'
	label var eca`v' "`lab'"
} 

* making set of dummies consistent with eca_ind
forvalues v = 1(1)4 {
	gen eca4_`v' = (eca_ind4==`v')
	local lab : label lab_eca_ind4 `v'
	label var eca4_`v' "`lab'"
} 



* Labels ***********************************************************************
label variable eca_ind "Policy Treatment"
*label variable port_ifexit "Port"
label variable route_name "Route"



*** Entrance/Clearance Ports 

* groupings
forvalues i = 1(1)2 {
	*capture drop port_name`i'_ec
	gen port_name`i'_ec = port_name`i'_ecport 
	replace port_name`i'_ec = "South" ///
			if ( inlist(port_name`i'_ecregion,"CAm","SAm","US_EC","Af","Eur") ) | /// destination with south if CAm, SAm, Europe, US East Coast
			( inlist(port_name`i'_ecregion,"NAm") & ~inlist(port_name`i'_eccountry,"Canada, WC","US_WC") ) //  destination with south if Mexico, or Canadian EC
	replace port_name`i'_ec = "SEAsia_MidEast"  /// 
		if inlist(port_name`i'_ecregion,"MidE","Asia")
	replace port_name`i'_ec = port_name`i'_eccountry  /// 
		if inlist(port_name`i'_eccountry,"China South","China Central","China North","China Taiwan","Hong Kong","Japan","South Korea","Russia, EC")
	replace port_name`i'_ec = port_name`i'_eccountry  /// 
		if inlist(port_name`i'_eccountry,"Canada, WC","Sea")  // grouping canadian, WC ports 
	replace port_name`i'_ec = port_name`i'_ecregion  /// 
		if inlist(port_name`i'_ecregion,"Oce")  // grouping oceania
	* group Taiwan in with China Central and Hong Kong into China South
	replace port_name`i'_ec = "China South"  /// 
		if inlist(port_name`i'_eccountry,"Hong Kong")  // grouping oceania	
	replace port_name`i'_ec = "China Central"  /// 
		if inlist(port_name`i'_eccountry,"China Taiwan")  // grouping oceania	
}



* owners/operators
*** groups are somewhat broader than owner/operator variables
egen owner_group_i = group(owner_group) , label
egen operator_group_i = group(operator_group) , label

* enter/exit routes to ports
rowsort port_name1_ecport  port_name2_ecport , gen(svar1 svar2)
egen route_ecport = group(svar*) , label
drop svar*

* enter/exit routes to countries
rowsort port_name1_eccountry port_name2_eccountry , gen(svar1 svar2)
egen route_eccountry = group(svar*) , label
drop svar*

* enter/exit routes to regions
rowsort port_name1_ecregion port_name2_ecregion , gen(svar1 svar2)
egen route_ecregion = group(svar*) , label
drop svar*

* enter/exit routes grouped
rowsort port_name1_ec port_name2_ec , gen(svar1 svar2)
egen route_ec = group(svar*) , label
drop svar*

* E/C south definition
gen south_ec = (port_name1_ec=="South") | (port_name2_ec=="South")
gen japan_ec = (port_name1_ec=="Japan") | (port_name2_ec=="Japan")
gen skorea_ec = (port_name1_ec=="South Korea") | (port_name2_ec=="South Korea")
gen china_ec = strpos(port_name1_ec, "China") | strpos(port_name2_ec, "China")
gen oce_ec = strpos(port_name1_ec, "Oce") | strpos(port_name2_ec, "Oce")

* flagging big outliers
* top 1%

* collapsing first
preserve
collapse (first) dist route_id vesseltype_regstr , by(track_id)
egen dist_pctile = xtile(dist) , by(route_id vesseltype_regstr) p(99)
gen dist_outlier = (dist_pctile~=1) 
keep track_id dist_pctile dist_outlier
save "dist_outlier_track" , replace
restore
merge m:1 track_id using "dist_outlier_track"
drop _merge
*hist dist if route_id_dtl=="SFBay_Port":route_id_dtl

* using entrance/clearance routes
preserve
collapse (first) dist route_ec vesseltype_regstr , by(track_id)
egen dist_pctile_ec = xtile(dist) , by(route_ec vesseltype_regstr) p(99)
gen dist_outlier_ec = (dist_pctile_ec~=1) 
keep track_id dist_pctile_ec dist_outlier_ec
save "dist_outlier_track" , replace
restore
merge m:1 track_id using "dist_outlier_track"
drop _merge

